The dataset I am using for Smartwatch data analysis is publicly available on Kaggle. This dataset was initially collected from 30 female users of the Fitbit smartwatch. You can download the dataset from here.
I will start the task of Smartwatch Data Analysis by importing the necessary Python libraries and the dataset:
import pandas as pd
import numpy as np
import requests
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
# Read the daily-activity CSV directly from its URL and preview the first five rows.
url = "https://raw.githubusercontent.com/amankharwal/Website-data/master/dailyActivity_merged.csv"
df = pd.read_csv(filepath_or_buffer=url)
preview_rows = df.head(n=5)
print(preview_rows)
Id ActivityDate TotalSteps TotalDistance TrackerDistance \ 0 1503960366 4/12/2016 13162 8.50 8.50 1 1503960366 4/13/2016 10735 6.97 6.97 2 1503960366 4/14/2016 10460 6.74 6.74 3 1503960366 4/15/2016 9762 6.28 6.28 4 1503960366 4/16/2016 12669 8.16 8.16 LoggedActivitiesDistance VeryActiveDistance ModeratelyActiveDistance \ 0 0.0 1.88 0.55 1 0.0 1.57 0.69 2 0.0 2.44 0.40 3 0.0 2.14 1.26 4 0.0 2.71 0.41 LightActiveDistance SedentaryActiveDistance VeryActiveMinutes \ 0 6.06 0.0 25 1 4.71 0.0 21 2 3.91 0.0 30 3 2.83 0.0 29 4 5.04 0.0 36 FairlyActiveMinutes LightlyActiveMinutes SedentaryMinutes Calories 0 13 328 728 1985 1 19 217 776 1797 2 11 181 1218 1776 3 34 209 726 1745 4 10 221 773 1863
# Count the missing (NULL/NaN) values in every column of the dataset.
null_counts = df.isna().sum()
print(null_counts)
Id 0 ActivityDate 0 TotalSteps 0 TotalDistance 0 TrackerDistance 0 LoggedActivitiesDistance 0 VeryActiveDistance 0 ModeratelyActiveDistance 0 LightActiveDistance 0 SedentaryActiveDistance 0 VeryActiveMinutes 0 FairlyActiveMinutes 0 LightlyActiveMinutes 0 SedentaryMinutes 0 Calories 0 dtype: int64
# Show the column names, non-null counts and dtypes of the dataframe.
# df.info() writes its report itself and returns None, which print() then echoes.
column_info = df.info()
print(column_info)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 940 entries, 0 to 939 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Id 940 non-null int64 1 ActivityDate 940 non-null object 2 TotalSteps 940 non-null int64 3 TotalDistance 940 non-null float64 4 TrackerDistance 940 non-null float64 5 LoggedActivitiesDistance 940 non-null float64 6 VeryActiveDistance 940 non-null float64 7 ModeratelyActiveDistance 940 non-null float64 8 LightActiveDistance 940 non-null float64 9 SedentaryActiveDistance 940 non-null float64 10 VeryActiveMinutes 940 non-null int64 11 FairlyActiveMinutes 940 non-null int64 12 LightlyActiveMinutes 940 non-null int64 13 SedentaryMinutes 940 non-null int64 14 Calories 940 non-null int64 dtypes: float64(7), int64(7), object(1) memory usage: 110.3+ KB None
# ActivityDate is loaded as an object (string) column; convert it to datetime
# so that date-based operations can be used later in the analysis.
activity_date_format = "%m/%d/%Y"
parsed_dates = pd.to_datetime(df["ActivityDate"], format=activity_date_format)
df["ActivityDate"] = parsed_dates
# df.info() prints its report and returns None, which print() then echoes.
print(df.info())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 940 entries, 0 to 939 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Id 940 non-null int64 1 ActivityDate 940 non-null datetime64[ns] 2 TotalSteps 940 non-null int64 3 TotalDistance 940 non-null float64 4 TrackerDistance 940 non-null float64 5 LoggedActivitiesDistance 940 non-null float64 6 VeryActiveDistance 940 non-null float64 7 ModeratelyActiveDistance 940 non-null float64 8 LightActiveDistance 940 non-null float64 9 SedentaryActiveDistance 940 non-null float64 10 VeryActiveMinutes 940 non-null int64 11 FairlyActiveMinutes 940 non-null int64 12 LightlyActiveMinutes 940 non-null int64 13 SedentaryMinutes 940 non-null int64 14 Calories 940 non-null int64 dtypes: datetime64[ns](1), float64(7), int64(7) memory usage: 110.3 KB None
# Add the very active, fairly active, lightly active and sedentary minutes
# together and store the result in a new "TotalMinutes" column.
minute_columns = [
    "VeryActiveMinutes",
    "FairlyActiveMinutes",
    "LightlyActiveMinutes",
    "SedentaryMinutes",
]
total_minutes = df[minute_columns[0]]
for minute_column in minute_columns[1:]:
    # Element-wise addition builds a new Series each time; the source columns are untouched.
    total_minutes = total_minutes + df[minute_column]
df["TotalMinutes"] = total_minutes
# Print five randomly chosen rows of the new column as a spot check.
print(df["TotalMinutes"].sample(5))
65 1440 275 1440 793 1365 257 1023 371 1440 Name: TotalMinutes, dtype: int64
# Print the descriptive statistics (count, mean, std, min, quartiles, max) of the dataset.
summary_stats = df.describe()
print(summary_stats)
Id TotalSteps TotalDistance TrackerDistance \
count 9.400000e+02 940.000000 940.000000 940.000000
mean 4.855407e+09 7637.910638 5.489702 5.475351
std 2.424805e+09 5087.150742 3.924606 3.907276
min 1.503960e+09 0.000000 0.000000 0.000000
25% 2.320127e+09 3789.750000 2.620000 2.620000
50% 4.445115e+09 7405.500000 5.245000 5.245000
75% 6.962181e+09 10727.000000 7.712500 7.710000
max 8.877689e+09 36019.000000 28.030001 28.030001
LoggedActivitiesDistance VeryActiveDistance ModeratelyActiveDistance \
count 940.000000 940.000000 940.000000
mean 0.108171 1.502681 0.567543
std 0.619897 2.658941 0.883580
min 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000
50% 0.000000 0.210000 0.240000
75% 0.000000 2.052500 0.800000
max 4.942142 21.920000 6.480000
LightActiveDistance SedentaryActiveDistance VeryActiveMinutes \
count 940.000000 940.000000 940.000000
mean 3.340819 0.001606 21.164894
std 2.040655 0.007346 32.844803
min 0.000000 0.000000 0.000000
25% 1.945000 0.000000 0.000000
50% 3.365000 0.000000 4.000000
75% 4.782500 0.000000 32.000000
max 10.710000 0.110000 210.000000
FairlyActiveMinutes LightlyActiveMinutes SedentaryMinutes \
count 940.000000 940.000000 940.000000
mean 13.564894 192.812766 991.210638
std 19.987404 109.174700 301.267437
min 0.000000 0.000000 0.000000
25% 0.000000 127.000000 729.750000
50% 6.000000 199.000000 1057.500000
75% 19.000000 264.000000 1229.500000
max 143.000000 518.000000 1440.000000
Calories TotalMinutes
count 940.000000 940.000000
mean 2303.609574 1218.753191
std 718.166862 265.931767
min 0.000000 2.000000
25% 1828.500000 989.750000
50% 2134.000000 1440.000000
75% 2793.250000 1440.000000
max 4900.000000 1440.000000
# Visualize the relationship between calories burned and total steps walked
# in a day using a scatter plot; marker size reflects the very active minutes
# and an OLS trendline is drawn through the points.
scatter_options = dict(
    x="Calories",
    y="TotalSteps",
    size="VeryActiveMinutes",
    trendline="ols",
    title="Relationship between Calories & Total Steps",
)
figure = px.scatter(df, **scatter_options)
figure.show()
There appears to be a linear relationship between the total number of steps and the number of calories burned in a day.
# Visualize the average number of minutes per day spent at each activity level.
minutes_by_label = {
    "Very Active Minutes": "VeryActiveMinutes",
    "Fairly Active Minutes": "FairlyActiveMinutes",
    "Lightly Active Minutes": "LightlyActiveMinutes",
    "Inactive Minutes": "SedentaryMinutes",
}
label = list(minutes_by_label.keys())
# Mean of each minutes column over all records, in the same order as the labels.
counts = df[list(minutes_by_label.values())].mean()
colors = ['gold', 'lightgreen', "pink", "blue"]
average_pie = go.Pie(labels=label, values=counts)
fig = go.Figure(data=[average_pie])
fig.update_layout(title_text='Total Active Minutes')
# Show the raw value on each slice; label and percentage appear on hover.
slice_marker = dict(colors=colors, line=dict(color='black', width=3))
fig.update_traces(
    hoverinfo='label+percent',
    textinfo='value',
    textfont_size=30,
    marker=slice_marker,
)
fig.show()
Observations:
On average, 81.3% of the tracked minutes in a day were inactive (sedentary).
15.8% of the tracked minutes in a day were lightly active.
On average, only 21 minutes (1.74%) of a day were very active.
Only 13 minutes (1.11%) of a day were fairly active.
# Derive the weekday name of every record from the ActivityDate column and
# store it in a new "Day" column.
weekday_names = df["ActivityDate"].dt.day_name()
df["Day"] = weekday_names
# Preview the first five entries of the new column.
print(df["Day"].head(5))
0 Tuesday 1 Wednesday 2 Thursday 3 Friday 4 Saturday Name: Day, dtype: object
# Visualize the very active, fairly active and lightly active minutes on each
# day of the week as a grouped bar chart (one trace per activity level).
fig = go.Figure()
activity_traces = [
    # (dataframe column, legend name, bar colour)
    ("VeryActiveMinutes", 'Very Active', 'purple'),
    ("FairlyActiveMinutes", 'Fairly Active', 'green'),
    ("LightlyActiveMinutes", 'Lightly Active', 'pink'),
]
for minutes_column, trace_name, bar_color in activity_traces:
    activity_bar = go.Bar(
        x=df["Day"],
        y=df[minutes_column],
        name=trace_name,
        marker_color=bar_color,
    )
    fig.add_trace(activity_bar)
fig.update_layout(barmode='group', xaxis_tickangle=-45)
fig.show()
# Visualize the total number of inactive (sedentary) minutes on each day of the week.
day = df["Day"].value_counts()
label = day.index
# Sum the sedentary minutes per weekday and align the totals with `label`.
# Passing the raw per-record column here would hand the pie one value per
# record rather than one value per weekday slice, so the label/value pairs
# would not correspond.
counts = df.groupby("Day")["SedentaryMinutes"].sum().reindex(label)
colors = ['gold', 'lightgreen', "pink", "blue", "skyblue", "cyan", "orange"]
fig = go.Figure(data=[go.Pie(labels=label, values=counts)])
fig.update_layout(title_text='Inactive Minutes Daily')
# Show the summed minutes on each slice; label and percentage appear on hover.
fig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=30,
                  marker=dict(colors=colors, line=dict(color='black', width=3)))
fig.show()
# Visualize the total calories burned on each day of the week.
calories = df["Day"].value_counts()
label = calories.index
# Sum the calories per weekday and align the totals with `label`.
# Passing the raw per-record column here would hand the pie one value per
# record rather than one value per weekday slice, so the label/value pairs
# would not correspond.
counts = df.groupby("Day")["Calories"].sum().reindex(label)
colors = ['gold', 'lightgreen', "pink", "blue", "skyblue", "cyan", "orange"]
fig = go.Figure(data=[go.Pie(labels=label, values=counts)])
fig.update_layout(title_text='Calories Burned Daily')
# Show the summed calories on each slice; label and percentage appear on hover.
fig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=30,
                  marker=dict(colors=colors, line=dict(color='black', width=3)))
fig.show()